Let’s continue with the digits data. We read in the data:
# Location of the MNIST training digits (CSV, one row per image, 784 pixel columns).
url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/hand-written-digits-train.csv"
# Download only once per session; `digits` persists in the global environment.
if(!exists("digits")) digits <- read_csv(url)
#digits <- read_csv(url)
To simplify the problem we will try to distinguish 2s from 7s. So we subset to only those digits.
# Keep only the 2s and 7s for the binary classification example.
dat <- digits %>% filter(label %in% c(2,7))
For illustrative purposes we created two features: X_1 is the percent of non-white pixels that are in the top left quadrant and X_2 is the percent of non-white pixels that are in the bottom right quadrant:
We can create these new predictors like we did before:
# Recode the outcome: y = 0 for a "2", y = 1 for a "7".
dat <- mutate(dat, label = as.character(label)) %>%
mutate(y = ifelse(label == "2",0,1 ))
# Map each of the 784 pixel columns to its (row, col) position in the 28x28 grid.
row_column <- expand.grid(row=1:28, col=1:28)
# Pixel indices for the upper-left and lower-right quadrants.
ind1 <- which(row_column$col <= 14 & row_column$row <= 14)
ind2 <- which(row_column$col > 14 & row_column$row > 14)
# NOTE(review): `ind` is not used anywhere below in this chunk — confirm it is
# needed by a chunk not shown here.
ind <- c(ind1,ind2)
# Drop the label column and binarize: a pixel is "non-white" if intensity > 200.
X <- as.matrix(dat[,-1])
X <- X > 200
# X_1, X_2: fraction of all non-white pixels falling in each quadrant.
X1 <- rowSums(X[,ind1])/rowSums(X)
X2 <- rowSums(X[,ind2])/rowSums(X)
dat <- mutate(dat, X_1 = X1, X_2 = X2)
We can see some examples of what these predictors (pixels) are:
We act as if we know the truth (the boundary that we should use to distinguish 2s from 7s):
For illustration purposes let’s take a subset of 1,000 handwritten digits:
# Reproducible subsample of 1,000 digits, keeping only the outcome and two features.
set.seed(1)
dat <- sample_n(dat, 1000) %>% select(y, X_1, X_2)
dat
## # A tibble: 1,000 x 3
## y X_1 X_2
## <dbl> <dbl> <dbl>
## 1 1 0.238 0.417
## 2 0 0.0610 0.293
## 3 0 0.151 0.315
## 4 0 0.0957 0.255
## 5 0 0.189 0.256
## 6 0 0 0.234
## 7 0 0.151 0.256
## 8 0 0.0192 0.212
## 9 1 0.129 0.186
## 10 0 0.0854 0.207
## # … with 990 more rows
Now create train and test sets:
# 50/50 split stratified on y; caret returns a list with one index vector ($Resample1).
inTrain <- createDataPartition(y = dat$y, p = 0.5)
head(inTrain)
## $Resample1
## [1] 1 7 8 9 11 13 14 15 16 20 23 27 29 33 34 36 37 39
## [19] 40 41 42 43 46 47 52 55 56 60 64 65 66 67 68 69 73 74
## [37] 75 79 80 82 86 87 90 93 94 95 96 97 98 103 104 106 107 113
## [55] 115 116 117 118 120 121 124 127 130 132 133 136 137 138 139 140 141 142
## [73] 143 144 146 152 157 158 161 165 166 171 173 175 180 181 185 187 190 191
## [91] 192 194 196 202 203 204 205 206 207 213 214 216 217 219 220 223 224 225
## [109] 227 228 231 232 234 235 236 240 241 243 244 245 248 250 251 255 258 264
## [127] 265 266 267 271 273 274 275 276 278 281 287 289 291 292 293 296 298 302
## [145] 303 304 305 306 309 315 318 323 326 327 335 336 338 340 341 343 346 347
## [163] 348 350 352 353 354 355 360 361 363 364 365 368 371 372 373 375 376 377
## [181] 378 379 382 383 386 387 390 391 397 400 401 403 404 406 408 410 411 412
## [199] 413 414 416 417 421 427 428 430 431 432 435 437 438 440 442 444 445 450
## [217] 451 452 454 455 458 459 462 463 464 469 474 476 479 481 482 484 486 487
## [235] 488 489 490 492 493 495 500 501 502 503 504 505 508 510 512 513 515 516
## [253] 517 519 520 522 528 529 530 531 534 536 537 538 541 544 547 549 551 552
## [271] 553 554 555 556 559 561 566 567 568 569 571 572 574 576 578 579 580 581
## [289] 582 583 586 590 591 592 593 595 596 602 606 607 608 610 611 613 614 615
## [307] 617 623 624 625 626 629 630 631 633 636 637 639 640 641 642 643 645 646
## [325] 647 649 650 653 655 656 657 658 661 662 665 666 669 670 671 672 673 675
## [343] 676 677 678 679 681 682 683 687 689 690 692 694 700 705 710 711 712 713
## [361] 714 717 718 720 721 722 723 725 726 727 730 735 737 739 744 745 746 749
## [379] 750 751 756 757 759 761 763 766 768 770 774 775 777 778 780 785 786 788
## [397] 790 791 792 797 799 800 801 802 804 806 807 808 809 810 812 814 815 816
## [415] 817 818 819 820 822 823 824 825 827 828 832 836 837 838 842 844 845 848
## [433] 852 854 855 856 859 861 865 866 868 871 874 877 878 882 883 884 886 887
## [451] 891 895 898 899 901 904 905 907 908 909 910 913 918 919 920 922 923 931
## [469] 932 933 937 938 940 942 943 945 950 955 956 957 960 961 962 963 964 968
## [487] 970 973 976 978 980 981 982 983 988 993 994 996 997 999
# Rows in Resample1 become the training set; the remaining rows the test set.
train_set <- slice(dat, inTrain$Resample1)
test_set <- slice(dat, -inTrain$Resample1)
Quadratic Discriminant Analysis (QDA) relates to the Naive Bayes approach we described earlier. We try to estimate \(\mbox{Pr}(Y=1|X=x)\) using Bayes theorem.
\[ p(x) = \mbox{Pr}(Y=1|\mathbf{X}=\mathbf{x}) = \frac{\pi f_{\mathbf{X}|Y=1}(x)} {(1-\pi) f_{\mathbf{X}|Y=0}(x) + \pi f_{\mathbf{X}|Y=1}(x)} \]
With QDA we assume that the distributions \(f_{\mathbf{X}|Y=1}(x)\) and \(f_{\mathbf{X}|Y=0}(x)\) are multivariate normal. In our case we have two predictors so we assume each one is bivariate normal. This implies we need to estimate two averages, two standard deviations, and a correlation for each case: \(Y=1\) and \(Y=0\).
This implies that we can approximate the distributions \(f_{X_1,X_2|Y=1}\) and \(f_{X_1, X_2|Y=0}\). We can easily estimate parameters from the data:
options(digits = 2)
# Per-class bivariate-normal parameter estimates: two means, two SDs, and a
# correlation for each of Y = 0 and Y = 1.
params <- train_set %>% group_by(y) %>%
summarize(avg_1 = mean(X_1), avg_2 = mean(X_2), sd_1= sd(X_1), sd_2 = sd(X_2), r = cor(X_1,X_2))
params
## # A tibble: 2 x 6
## y avg_1 avg_2 sd_1 sd_2 r
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.130 0.286 0.0674 0.0592 0.436
## 2 1 0.232 0.292 0.0750 0.107 0.360
So here are the data and contour plots showing the two normal densities:
# Feature scatter with one normal-density ellipse per class.
train_set %>% mutate(y = factor(y)) %>%
ggplot(aes(X_1, X_2, fill = y, color = y)) +
geom_point(pch = 21, cex = 5, color = "black") +
stat_ellipse(lwd = 2, type = "norm")
This defines the following estimate of \(f(x_1, x_2)\)
library(mvtnorm)
# Evaluate the bivariate normal density f(X_1, X_2) at the rows of `data`,
# using the class-specific parameters in `params` (a one-row tibble with
# avg_1, avg_2, sd_1, sd_2, and correlation r).
get_p <- function(params, data){
  mu <- c(params$avg_1, params$avg_2)
  cov_12 <- params$sd_1 * params$sd_2 * params$r
  sigma <- matrix(c(params$sd_1^2, cov_12,
                    cov_12, params$sd_2^2), 2, 2)
  dmvnorm(cbind(data$X_1, data$X_2), mean = mu, sigma = sigma)
}
# Prior probability of class 1; note this masks base R's constant `pi`.
pi <- 0.5
# Class-conditional densities evaluated on the grid `true_f`
# (defined in a chunk not shown here).
p0 <- get_p(params[1,], true_f)
p1 <- get_p(params[2,], true_f)
# Bayes' rule: posterior probability of class 1.
f_hat_qda <- pi*p1/(pi*p1 + (1-pi)*p0)
# Posterior surface with the decision boundary drawn at 0.5.
p <-true_f %>% mutate(f=f_hat_qda) %>%
ggplot(aes(X_1, X_2, fill=f)) +
scale_fill_gradientn(colors=c("#F8766D","white","#00BFC4")) +
geom_raster() +
stat_contour(aes(x=X_1,y=X_2,z=f),
data=mutate(true_f, f=f_hat_qda),
breaks=c(0.5),color="black",lwd=1.5)
grid.arrange(true_f_plot, p, nrow=1)
This fits the “truth” extremely well. We can perform QDA automatically using the
qda function from the MASS package.
# Fit QDA with MASS::qda and predict on the held-out test set.
qda_fit <- qda(y ~ X_1 + X_2, data = train_set)
qda_preds <- predict(qda_fit, test_set)
Let’s see how well we perform on the test set. Note that the class prediction (0 or 1) can be accessed using $class after using the predict function to make predictions. We get an overall accuracy of 0.85 and balanced sensitivity and specificity.
# Compare predicted classes against the truth on the test set.
confusionMatrix(data = as.factor(qda_preds$class), reference = as.factor(test_set$y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 219 35
## 1 40 206
##
## Accuracy : 0.85
## 95% CI : (0.816, 0.88)
## No Information Rate : 0.518
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7
##
## Mcnemar's Test P-Value : 0.644
##
## Sensitivity : 0.846
## Specificity : 0.855
## Pos Pred Value : 0.862
## Neg Pred Value : 0.837
## Prevalence : 0.518
## Detection Rate : 0.438
## Detection Prevalence : 0.508
## Balanced Accuracy : 0.850
##
## 'Positive' Class : 0
##
Here we have 2 predictors and had to compute 4 means, 4 SDs and 2 correlations. How many parameters would we have if instead of 2 predictors we had 10?
The main problem comes from estimating correlations for 10 predictors. With 10, we have 45 correlations for each class. In general the formula is \(p(p-1)/2\) which gets big quickly.
A solution to this is to assume that the correlation structure is the same for all classes. This reduces the number of parameters we need to estimate. When we do this, we can show mathematically that the solution is “linear”, in the linear algebra sense, and we call it Linear Discriminant Analysis (LDA).
# Re-estimate the per-class parameters on the training set.
params <- train_set %>% group_by(y) %>%
  summarize(avg_1 = mean(X_1), avg_2 = mean(X_2), sd_1= sd(X_1), sd_2 = sd(X_2), r = cor(X_1,X_2))
# LDA assumption: the covariance structure is shared across classes, so pool
# each parameter by averaging over the two classes.
# Fix: sd_2 was previously set to mean(sd_1); it must pool the X_2 SDs.
# (The printed table below will show a slightly larger pooled sd_2.)
params <- params %>% mutate(sd_1 = mean(sd_1), sd_2 = mean(sd_2), r = mean(r))
params
## # A tibble: 2 x 6
## y avg_1 avg_2 sd_1 sd_2 r
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.130 0.286 0.0712 0.0712 0.398
## 2 1 0.232 0.292 0.0712 0.0712 0.398
This defines the following estimate of \(f(x_1, x_2)\) and the boundary becomes linear:
# Class-conditional densities on the grid under the pooled (LDA) parameters.
p0 <- get_p(params[1,], data = true_f)
p1 <- get_p(params[2,], data = true_f)
# Fix: this line previously assigned 0.5 to `p` (immediately overwritten by the
# plot object below); the posterior formula uses the prior `pi`, which only
# worked because `pi` lingered from an earlier chunk.
pi <- 0.5
f_hat_lda <- pi*p1/(pi*p1 + (1-pi)*p0)
# Posterior surface; the 0.5 contour is now a straight line.
p <- true_f %>% mutate(f=f_hat_lda) %>%
  ggplot(aes(X_1, X_2, fill=f)) +
  scale_fill_gradientn(colors=c("#F8766D","white","#00BFC4")) +
  geom_raster() +
  stat_contour(aes(x=X_1,y=X_2,z=f),
               data=mutate(true_f, f=f_hat_lda),
               breaks=c(0.5),color="black",lwd=1.5)
grid.arrange(true_f_plot, p, nrow=1)
With this simplification we gain computation speed and feasibility, but lose accuracy. We can fit an LDA model automatically using the lda function, similar to the qda function. We see a dip in overall accuracy to 0.816.
# Fit LDA with MASS::lda (same interface as qda) and evaluate on the test set.
lda_fit <- lda(y ~ X_1 + X_2, data = train_set)
lda_preds <- predict(lda_fit, test_set)
confusionMatrix(data = as.factor(lda_preds$class), reference = as.factor(test_set$y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 207 40
## 1 52 201
##
## Accuracy : 0.816
## 95% CI : (0.779, 0.849)
## No Information Rate : 0.518
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.632
##
## Mcnemar's Test P-Value : 0.251
##
## Sensitivity : 0.799
## Specificity : 0.834
## Pos Pred Value : 0.838
## Neg Pred Value : 0.794
## Prevalence : 0.518
## Detection Rate : 0.414
## Detection Prevalence : 0.494
## Balanced Accuracy : 0.817
##
## 'Positive' Class : 0
##
With the example we have been examining we can make two types of errors: calling a 2 a 7 or calling a 7 a 2. More generally, with binary data we call these false positives (calling a 0 a 1) and false negatives (calling a 1 a 0). Here we have arbitrarily made 7s 1s and 2s 0s.
This concept is important in many areas and in particular in health where one type of mistake can be much more costly than another. Note that we have been predicting 1s based on the rule \(\hat{f}(x_1, x_2) > 0.5\) but we can pick another cutoff, depending on the cost of each type of error. For example, if we are predicting if a plane will malfunction, then we want a very low false negative rate and are willing to accept a higher false positive rate to achieve it.
We can see that the estimated probabilities are on a continuum:
# Re-estimate the per-class QDA parameters on the training set.
params <- train_set %>% group_by(y) %>%
  summarize(avg_1 = mean(X_1), avg_2 = mean(X_2),
            sd_1= sd(X_1), sd_2 = sd(X_2),
            r = cor(X_1,X_2))
# Fix: spell out the argument name `data` (was the partial match `dat=`).
p0 <- get_p(params[1,], data = test_set)
p1 <- get_p(params[2,], data = test_set)
pi <- 0.5
# Posterior probability of class 1 for each test observation.
pred_qda <- pi*p1/(pi*p1 + (1-pi)*p0)
# Boxplot of predicted probabilities by true class: a continuum, not just 0/1.
test_set %>% mutate(pred=pred_qda, label=as.factor(y)) %>%
  ggplot(aes(label,pred)) + geom_boxplot()
The Receiver Operating Characteristic Curve (ROC Curve) plots true positive rate versus false positive rate for several choices of cutoff. We can create this curve with the following code:
# ROC curve for the QDA posterior probabilities on the test set.
library(pROC)
roc_qda <- roc(test_set$y, pred_qda)
plot(roc_qda)
Here are the results for LDA
# Pool the parameters across classes (LDA assumption).
# Fix: sd_2 previously pooled mean(sd_1); it must pool the X_2 SDs.
params <- params %>% mutate(sd_1 = mean(sd_1),
                            sd_2 = mean(sd_2),
                            r = mean(r))
# Fix: spell out the argument name `data` (was the partial match `dat=`).
p0 <- get_p(params[1,], data = test_set)
p1 <- get_p(params[2,], data = test_set)
pi <- 0.5
pred_lda <- pi*p1/(pi*p1 + (1-pi)*p0)
roc_lda <- roc(test_set$y, pred_lda)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Overlay the LDA ROC curve (red) on the QDA curve.
plot(roc_qda)
plot(roc_lda, add=TRUE, col=2)
legend(0, 0.5, legend=c("QDA", "LDA"),
       col=c("black", "red"), lty=1)
We can also compare to KNN with two different values of k: 5 and 51.
# kNN with k = 5; column 2 of the prediction matrix is Pr(y = 1).
fit <- knn3(y~., data = train_set, k = 5)
pred_knn_5 <- predict(fit, newdata = test_set)[,2]
roc_knn_5 <- roc(test_set$y, pred_knn_5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_qda)
plot(roc_knn_5, add=TRUE, col=3)
legend(0.2, 0.5, legend=c("QDA", "KNN, k = 5"),
col=c("black", 3), lty=1)
# kNN with a smoother fit, k = 51.
fit <- knn3(y~., data = train_set, k = 51)
pred_knn_51 <- predict(fit, newdata = test_set)[,2]
roc_knn_51 <- roc(test_set$y, pred_knn_51)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_qda)
plot(roc_knn_51, add=TRUE, col=4)
legend(0.2, 0.5, legend=c("QDA", "KNN, k = 51"),
col=c("black", 4), lty=1)
To summarize these curves into one single number that can be compared across methods, it is common to take the area under the ROC curve (abbreviated AUC or AUROC). Higher values indicate better performance.
# Area under each ROC curve; QDA and kNN (k = 51) do best here.
auc(roc_qda)
## Area under the curve: 0.92
auc(roc_lda)
## Area under the curve: 0.89
auc(roc_knn_5)
## Area under the curve: 0.88
auc(roc_knn_51)
## Area under the curve: 0.92
Here we look at an example where there are three classes to consider. In addition to 2s and 7s, we also consider 1s. Now we need to estimate three different curves that represent the conditional probabilities of being each digit given the values of \(X_1\) and \(X_2\).
We’ll create a training set and test set so we can test out how each method does on the three class problem.
set.seed(1)
# NOTE(review): the visible `dat` was reduced above to 1,000 rows of
# (y, X_1, X_2), so sampling 3,000 rows with a `label` column presumes `dat`
# was rebuilt (including 1s) in a chunk not shown here — confirm.
dat <- sample_n(dat, 3000) %>% select(label, X_1, X_2) %>% mutate(label = as.factor(label))
# Stratified 50/50 train/test split on the three-class label.
inTrain <- createDataPartition(y = dat$label, p = 0.5)
train_set <- slice(dat, inTrain$Resample1)
test_set <- slice(dat, -inTrain$Resample1)
# Visualize the three classes in the two-feature space.
train_set %>% ggplot(aes(X_1,X_2,fill=label)) + geom_point(cex=3, pch=21)
# Per-class parameter estimates for the three digits.
params <- train_set %>% group_by(label) %>%
  summarize(avg_1 = mean(X_1), avg_2 = mean(X_2), sd_1= sd(X_1), sd_2 = sd(X_2), r = cor(X_1,X_2))
# Pool across classes for LDA.
# Fix: sd_2 previously pooled mean(sd_1); it must pool the X_2 SDs.
params <- params %>% mutate(sd_1 = mean(sd_1), sd_2 = mean(sd_2), r = mean(r))
params
## # A tibble: 3 x 6
## label avg_1 avg_2 sd_1 sd_2 r
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.0708 0.246 0.0826 0.0826 0.436
## 2 2 0.132 0.280 0.0826 0.0826 0.436
## 3 7 0.240 0.292 0.0826 0.0826 0.436
This defines the following estimate of \(f(x_1, x_2)\) and the boundary becomes linear:
# Density of each class on the grid under the pooled (LDA) parameters.
p0 <- get_p(params[1,], true_f)
p1 <- get_p(params[2,], true_f)
p2 <- get_p(params[3,], true_f)
# Predict the class with the largest density (1 = "1", 2 = "2", 3 = "7").
pred <- apply(cbind(p0, p1, p2),1,which.max)
tmp <- true_f %>% mutate(pred=pred)
tmp %>% ggplot() +
  stat_contour(aes(x=X_1,y=X_2,z=pred),
               breaks=c(1,2,3),color="black",lwd=1.5) +
  # Fix: spell out the argument name `data` (was the partial match `dat=`).
  geom_point(aes(X_1,X_2,fill=label), data=test_set,pch=21)
# Un-pooled per-class parameters for QDA (each class keeps its own SDs and r).
params <- train_set %>% group_by(label) %>%
summarize(avg_1 = mean(X_1), avg_2 = mean(X_2), sd_1= sd(X_1), sd_2 = sd(X_2), r = cor(X_1,X_2))
This defines the following estimate of \(f(x_1, x_2)\) and the boundary becomes non-linear:
# Density of each class on the grid under the class-specific (QDA) parameters.
p0 <- get_p(params[1,], true_f)
p1 <- get_p(params[2,], true_f)
p2 <- get_p(params[3,], true_f)
# Predict the class with the largest density.
pred <- apply(cbind(p0, p1, p2),1,which.max)
tmp <- true_f %>% mutate(pred=pred)
tmp %>% ggplot() +
  stat_contour(aes(x=X_1,y=X_2,z=pred),
               breaks=c(1,2,3),color="black",lwd=1.5) +
  # Fix: spell out the argument name `data` (was the partial match `dat=`).
  geom_point(aes(X_1,X_2,fill=label), data=test_set,pch=21)
Let’s compare the performance of LDA and QDA with logistic regression. For each label, we will train a model to predict that number, or not that number. So fit1 predicts 1 vs not 1, fit2 predicts 2 vs not 2, and fit7 predicts 7 vs not 7. Then we can also plot these boundaries.
# One-vs-rest logistic regression: one binary fit per digit.
fit1 <- glm(y~X_1+X_2, data=mutate(train_set,
                                   y=label=="1"),family="binomial")
fit2 <- glm(y~X_1+X_2, data=mutate(train_set,
                                   y=label=="2"),family="binomial")
fit7 <- glm(y~X_1+X_2, data=mutate(train_set,
                                   y=label=="7"),family="binomial")
# Fitted probabilities for each digit on the grid.
f_hat1 <- predict(fit1, newdata = true_f, type = "response")
f_hat2 <- predict(fit2, newdata = true_f, type ="response")
f_hat7 <- predict(fit7, newdata = true_f, type = "response")
# Predict the digit with the largest fitted probability.
pred <- apply(cbind(f_hat1, f_hat2, f_hat7),1,which.max)
tmp <- true_f %>% mutate(pred=pred)
tmp %>% ggplot() +
  stat_contour(aes(x=X_1,y=X_2,z=pred),
               breaks=c(1,2,3),color="black",lwd=1.5) +
  # Fix: spell out the argument name `data` (was the partial match `dat=`).
  geom_point(aes(X_1,X_2,fill=label), data=test_set,pch=21)
Let’s also use kNN with k = 51 and k = 101 and draw the boundaries.
# kNN (k = 51) decision regions on the grid.
fit <- knn3(label~., data=train_set, k=51)
f_hat <- predict(fit, newdata = true_f)
# NOTE(review): `f_hat_max` appears unused below — confirm before removing.
f_hat_max <- apply(f_hat,1,max)
pred <- apply(f_hat,1,which.max)
tmp <- true_f %>% mutate(pred=pred)
tmp %>% ggplot() +
  stat_contour(aes(x=X_1,y=X_2,z=pred),
               breaks=c(1,2,3),color="black",lwd=1.5) +
  # Fix: spell out the argument name `data` (was the partial match `dat=`).
  geom_point(aes(X_1,X_2,fill=label), data=test_set,pch=21)
# kNN (k = 101): smoother decision regions.
fit <- knn3(label~., data=train_set, k=101)
f_hat <- predict(fit, newdata = true_f)
# NOTE(review): `f_hat_max` appears unused below — confirm before removing.
f_hat_max <- apply(f_hat,1,max)
pred <- apply(f_hat,1,which.max)
tmp <- true_f %>% mutate(pred=pred)
tmp %>% ggplot() +
  stat_contour(aes(x=X_1,y=X_2,z=pred),
               breaks=c(1,2,3),color="black",lwd=1.5) +
  # Fix: spell out the argument name `data` (was the partial match `dat=`).
  geom_point(aes(X_1,X_2,fill=label), data=test_set,pch=21)
After training each model we can predict labels for the test set and then compare the accuracy of each method.
##QDA
# Class-specific parameters; predicted class = argmax of the three densities.
params <- train_set %>% group_by(label) %>%
summarize(avg_1 = mean(X_1), avg_2 = mean(X_2), sd_1= sd(X_1), sd_2 = sd(X_2), r = cor(X_1,X_2))
p0 <- get_p(params[1,], test_set)
p1 <- get_p(params[2,], test_set)
p2 <- get_p(params[3,], test_set)
pred_qda <- apply(cbind(p0, p1, p2),1,which.max)
##LDA
# Pool the parameters across classes.
# Fix: sd_2 previously pooled mean(sd_1); it must pool the X_2 SDs.
params <- params %>% mutate(sd_1 = mean(sd_1), sd_2 = mean(sd_2), r = mean(r))
p0 <- get_p(params[1,], test_set)
p1 <- get_p(params[2,], test_set)
p2 <- get_p(params[3,], test_set)
# Predicted class = argmax of the three pooled densities.
pred_lda <- apply(cbind(p0, p1, p2),1,which.max)
##GLM
# One-vs-rest logistic regression, refit and applied to the test set.
fit1 <- glm(y~X_1+X_2, data=mutate(train_set,
y=label=="1"),family="binomial")
fit2 <- glm(y~X_1+X_2, data=mutate(train_set,
y=label=="2"),family="binomial")
fit7 <- glm(y~X_1+X_2, data=mutate(train_set,
y=label=="7"),family="binomial")
f_hat1 <- predict(fit1, newdata = test_set, type = "response")
f_hat2 <- predict(fit2, newdata = test_set, type ="response")
f_hat7 <- predict(fit7, newdata = test_set, type = "response")
# Predicted class = digit with the largest fitted probability.
pred_glm <- apply(cbind(f_hat1, f_hat2, f_hat7),1,which.max)
##KNN 51
# kNN predictions on the test set; argmax over the three class-probability columns.
fit <- knn3(label~., data=train_set, k=51)
f_hat <- predict(fit, newdata = test_set)
pred_knn_51 <- apply(f_hat,1,which.max)
##KNN 101
fit <- knn3(label~., data=train_set, k=101)
f_hat <- predict(fit, newdata = test_set)
pred_knn_101 <- apply(f_hat,1,which.max)
Let’s compare:
# Relabel the numeric predictions (1/2/3) back to digits and cross-tabulate.
tab <- table(factor(pred_lda, labels=c("1","2","7")), test_set$label)
confusionMatrix(tab)
## Confusion Matrix and Statistics
##
##
## 1 2 7
## 1 393 155 26
## 2 67 210 89
## 7 71 97 391
##
## Overall Statistics
##
## Accuracy : 0.663
## 95% CI : (0.639, 0.687)
## No Information Rate : 0.354
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.492
##
## Mcnemar's Test P-Value : 3.99e-12
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 7
## Sensitivity 0.740 0.455 0.773
## Specificity 0.813 0.850 0.831
## Pos Pred Value 0.685 0.574 0.699
## Neg Pred Value 0.851 0.778 0.878
## Prevalence 0.354 0.308 0.338
## Detection Rate 0.262 0.140 0.261
## Detection Prevalence 0.383 0.244 0.373
## Balanced Accuracy 0.777 0.652 0.802
# Overall accuracy of each method on the three-class test set.
confusionMatrix(tab)$overall[1]
## Accuracy
## 0.66
tab <- table(factor(pred_qda, labels=c("1","2","7")), test_set$label)
confusionMatrix(tab)$overall[1]
## Accuracy
## 0.74
tab <- table(factor(pred_glm, labels=c("1","2","7")), test_set$label)
confusionMatrix(tab)$overall[1]
## Accuracy
## 0.61
tab <- table(factor(pred_knn_51, labels=c("1","2","7")), test_set$label)
confusionMatrix(tab)$overall[1]
## Accuracy
## 0.75
tab <- table(factor(pred_knn_101, labels=c("1","2","7")), test_set$label)
confusionMatrix(tab)$overall[1]
## Accuracy
## 0.74